Please upvote this if you find it useful
import pandas as pd
# Some sklearn tools for preprocessing and building a pipeline.
# ColumnTransformer was introduced in 0.20 so make sure you have this version
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
# Our algorithms, from the easiest to the hardest to interpret.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
Note: This notebook uses features introduced in Python 3.6 and sklearn 0.20.
The dataset is the UCI Bank Marketing dataset (bank-additional-full.csv). It consists of data from marketing campaigns of a Portuguese bank. We will try to build classifiers that can predict whether or not the client targeted by the campaign ended up subscribing to a term deposit (column y).
import pandas as pd

# The file is semicolon-separated: reading it with pandas' default comma
# separator would produce a single-column DataFrame, so pass sep=';' up front.
# (The original code read the file twice, the first time without sep=';',
# wasting I/O on a mis-parsed frame that was immediately discarded.)
df = pd.read_csv(r'C:\Users\Ramsey\Downloads\bank-additional-full.csv', sep=';')
# Inspect the class balance of the target column.
df.y.value_counts()
y no 36548 yes 4640 Name: count, dtype: int64
The dataset is imbalanced, we will need to keep that in mind when building our models!
# Build the target vector and the feature matrix.
# Encode the target as 0/1 so every classifier below can consume it directly.
y = df["y"].map({"no": 0, "yes": 1})
# Everything except the target column becomes the feature matrix.
X = df.drop(columns=["y"])
Let's look at the features in the X matrix:
Note the comment about duration feature. We will exclude it from our analysis.
# "duration" is only known once the call has ended, so keeping it would leak
# the outcome into the features; drop it before modelling.
X.drop(columns="duration", inplace=True)
X.dtypes
age int64 job object marital object education object default object housing object loan object contact object month object day_of_week object campaign int64 pdays int64 previous int64 poutcome object emp.var.rate float64 cons.price.idx float64 cons.conf.idx float64 euribor3m float64 nr.employed float64 dtype: object
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# The file is semicolon-separated; without sep=';' pandas parses everything
# into a single text column and the dtype-based selection below breaks.
df = pd.read_csv(r'C:\Users\Ramsey\Downloads\bank-additional-full.csv', sep=';')
# Infer the feature groups from their dtypes.
# NOTE(review): this also picks up the target column "y" (dtype object) as a
# categorical feature; the explicit feature lists defined later override these
# variables before any model is fitted -- confirm if reusing this cell alone.
num_features = df.select_dtypes(include=['int64', 'float64']).columns
cat_features = df.select_dtypes(include=['object']).columns
# Keep numerical features as-is and one-hot encode the categorical ones.
# `sparse` was renamed to `sparse_output` in sklearn 1.2 and removed in 1.4.
preprocessor = ColumnTransformer([("numerical", "passthrough", num_features),
                                  ("categorical",
                                   OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
                                   cat_features)])
# Some such as default would be binary features, but since
# they have a third class "unknown" we'll process them as non binary categorical
num_features = ["age", "campaign", "pdays", "previous", "emp.var.rate",
                "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
cat_features = ["job", "marital", "education", "default", "housing", "loan",
                "contact", "month", "day_of_week", "poutcome"]
# ColumnTransformer (new in sklearn 0.20) keeps our numerical features and
# applies one-hot encoding to our categorical features.  That allows us to
# create a clean pipeline that includes both feature engineering (one-hot
# encoding here) and model training -- a nice way to avoid data leakage.
# `sparse` was renamed to `sparse_output` in sklearn 1.2 and removed in 1.4.
preprocessor = ColumnTransformer([("numerical", "passthrough", num_features),
                                  ("categorical",
                                   OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
                                   cat_features)])
Now we can define our 4 models as sklearn Pipeline object, containing our preprocessing step and training of one given algorithm.
# Each model is a Pipeline: preprocessing + estimator, so the one-hot encoder
# is fit only on training folds (no data leakage during cross-validation).
# Logistic Regression
lr_model = Pipeline([("preprocessor", preprocessor),
                     ("model", LogisticRegression(class_weight="balanced",
                                                  solver="liblinear",
                                                  random_state=42))])
# Decision Tree (random_state pinned for reproducibility, matching the LR)
dt_model = Pipeline([("preprocessor", preprocessor),
                     ("model", DecisionTreeClassifier(class_weight="balanced",
                                                      random_state=42))])
# Random Forest
rf_model = Pipeline([("preprocessor", preprocessor),
                     ("model", RandomForestClassifier(class_weight="balanced",
                                                      n_estimators=100,
                                                      n_jobs=-1,
                                                      random_state=42))])
# XGBoost: scale_pos_weight should be the ratio of negative to positive
# samples, n_neg / n_pos = (1 - y.mean()) / y.mean() (~7.9 here).  The
# original value, 1 - y.mean() (~0.89), left the classes essentially
# unweighted.
xgb_model = Pipeline([("preprocessor", preprocessor),
                      ("model", XGBClassifier(scale_pos_weight=(1 - y.mean()) / y.mean(),
                                              n_jobs=-1,
                                              random_state=42))])
Let's split the data into training and test sets.
# Hold out 30% of the rows for testing; stratify on y so the class imbalance
# is preserved in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)
We're good to go!
First let's fine tune our logistic regression and evaluate its performance.
# Tune the regularisation strength C with 5-fold CV, scoring on accuracy.
lr_grid = {"model__C": [1, 1.3, 1.5]}
gs = GridSearchCV(lr_model, lr_grid, cv=5, scoring="accuracy", n_jobs=-1)
gs.fit(X_train, y_train)
`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical',
'passthrough',
['age',
'campaign',
'pdays',
'previous',
'emp.var.rate',
'cons.price.idx',
'cons.conf.idx',
'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job',
'marital',
'education',
'default',
'housing',
'loan',
'contact',
'month',
'day_of_week',
'poutcome'])])),
('model',
LogisticRegression(class_weight='balanced',
random_state=42,
solver='liblinear'))]),
n_jobs=-1, param_grid={'model__C': [1, 1.3, 1.5]},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical',
'passthrough',
['age',
'campaign',
'pdays',
'previous',
'emp.var.rate',
'cons.price.idx',
'cons.conf.idx',
'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job',
'marital',
'education',
'default',
'housing',
'loan',
'contact',
'month',
'day_of_week',
'poutcome'])])),
('model',
LogisticRegression(class_weight='balanced',
random_state=42,
solver='liblinear'))]),
n_jobs=-1, param_grid={'model__C': [1, 1.3, 1.5]},
scoring='accuracy')Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_of_week',
'poutcome'])])),
('model',
LogisticRegression(class_weight='balanced', random_state=42,
solver='liblinear'))])ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays', 'previous',
'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month',
'day_of_week', 'poutcome'])])['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
passthrough
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
OneHotEncoder(handle_unknown='ignore', sparse=False)
LogisticRegression(class_weight='balanced', random_state=42, solver='liblinear')
Let's see our best parameters and score
# Report the winning hyper-parameters, then the mean CV accuracy.
for result in (gs.best_params_, gs.best_score_):
    print(result)
{'model__C': 1}
0.8276854248829608
# Push the tuned hyper-parameters back into the standalone pipeline so it can
# be refit on the full training set.
lr_model.set_params(**gs.best_params_)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_of_week',
'poutcome'])])),
('model',
LogisticRegression(C=1, class_weight='balanced',
random_state=42, solver='liblinear'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_of_week',
'poutcome'])])),
('model',
LogisticRegression(C=1, class_weight='balanced',
random_state=42, solver='liblinear'))])ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays', 'previous',
'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month',
'day_of_week', 'poutcome'])])['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
passthrough
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
OneHotEncoder(handle_unknown='ignore', sparse=False)
LogisticRegression(C=1, class_weight='balanced', random_state=42,
solver='liblinear')lr_model.get_params("model")
{'memory': None,
'steps': [('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays', 'previous',
'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month',
'day_of_week', 'poutcome'])])),
('model',
LogisticRegression(C=1, class_weight='balanced', random_state=42,
solver='liblinear'))],
'verbose': False,
'preprocessor': ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays', 'previous',
'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month',
'day_of_week', 'poutcome'])]),
'model': LogisticRegression(C=1, class_weight='balanced', random_state=42,
solver='liblinear'),
'preprocessor__n_jobs': None,
'preprocessor__remainder': 'drop',
'preprocessor__sparse_threshold': 0.3,
'preprocessor__transformer_weights': None,
'preprocessor__transformers': [('numerical',
'passthrough',
['age',
'campaign',
'pdays',
'previous',
'emp.var.rate',
'cons.price.idx',
'cons.conf.idx',
'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore', sparse=False),
['job',
'marital',
'education',
'default',
'housing',
'loan',
'contact',
'month',
'day_of_week',
'poutcome'])],
'preprocessor__verbose': False,
'preprocessor__verbose_feature_names_out': True,
'preprocessor__numerical': 'passthrough',
'preprocessor__categorical': OneHotEncoder(handle_unknown='ignore', sparse=False),
'preprocessor__categorical__categories': 'auto',
'preprocessor__categorical__drop': None,
'preprocessor__categorical__dtype': numpy.float64,
'preprocessor__categorical__feature_name_combiner': 'concat',
'preprocessor__categorical__handle_unknown': 'ignore',
'preprocessor__categorical__max_categories': None,
'preprocessor__categorical__min_frequency': None,
'preprocessor__categorical__sparse': False,
'preprocessor__categorical__sparse_output': True,
'model__C': 1,
'model__class_weight': 'balanced',
'model__dual': False,
'model__fit_intercept': True,
'model__intercept_scaling': 1,
'model__l1_ratio': None,
'model__max_iter': 100,
'model__multi_class': 'auto',
'model__n_jobs': None,
'model__penalty': 'l2',
'model__random_state': 42,
'model__solver': 'liblinear',
'model__tol': 0.0001,
'model__verbose': 0,
'model__warm_start': False}
Now we can fit the model on the whole training set and calculate accuracy on the test set.
# Refit the tuned logistic-regression pipeline on the whole training set.
lr_model.fit(X_train, y_train)
`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_of_week',
'poutcome'])])),
('model',
LogisticRegression(C=1, class_weight='balanced',
random_state=42, solver='liblinear'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_of_week',
'poutcome'])])),
('model',
LogisticRegression(C=1, class_weight='balanced',
random_state=42, solver='liblinear'))])ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays', 'previous',
'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month',
'day_of_week', 'poutcome'])])['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
passthrough
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
OneHotEncoder(handle_unknown='ignore', sparse=False)
LogisticRegression(C=1, class_weight='balanced', random_state=42,
solver='liblinear')Generate predictions
# Score the tuned model on the held-out test set.
y_pred = lr_model.predict(X_test)
accuracy_score(y_test, y_pred)
0.8322408351541636
# Per-class precision/recall/F1 -- more informative than plain accuracy on
# this imbalanced dataset.
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.95 0.86 0.90 10965
1 0.36 0.64 0.46 1392
accuracy 0.83 12357
macro avg 0.66 0.75 0.68 12357
weighted avg 0.88 0.83 0.85 12357
Let's use eli5 to visualise the weights associated to each feature:
# NOTE(review): the traceback below shows eli5 failing to import against the
# installed sklearn (it relies on `if_delegate_has_method`, removed in newer
# sklearn releases).  Pin an older sklearn, or use
# sklearn.inspection.permutation_importance instead -- verify versions.
import eli5
eli5.show_weights(lr_model.named_steps["model"])
--------------------------------------------------------------------------- ImportError Traceback (most recent call last) c:\Users\Ramsey\Downloads\responsible-ai-model-explainability.ipynb Cell 44 line 1 ----> <a href='vscode-notebook-cell:/c%3A/Users/Ramsey/Downloads/responsible-ai-model-explainability.ipynb#X61sZmlsZQ%3D%3D?line=0'>1</a> import eli5 <a href='vscode-notebook-cell:/c%3A/Users/Ramsey/Downloads/responsible-ai-model-explainability.ipynb#X61sZmlsZQ%3D%3D?line=1'>2</a> eli5.show_weights(lr_model.named_steps["model"]) File c:\Users\Ramsey\AppData\Local\Programs\Python\Python311\Lib\site-packages\eli5\__init__.py:13 6 from .formatters import ( 7 format_as_html, 8 format_html_styles, 9 format_as_text, 10 format_as_dict, 11 ) 12 from .explain import explain_weights, explain_prediction ---> 13 from .sklearn import explain_weights_sklearn, explain_prediction_sklearn 14 from .transform import transform_feature_names 17 try: File c:\Users\Ramsey\AppData\Local\Programs\Python\Python311\Lib\site-packages\eli5\sklearn\__init__.py:3 1 # -*- coding: utf-8 -*- 2 from __future__ import absolute_import ----> 3 from .explain_weights import ( 4 explain_weights_sklearn, 5 explain_linear_classifier_weights, 6 explain_linear_regressor_weights, 7 explain_rf_feature_importance, 8 explain_decision_tree, 9 ) 10 from .explain_prediction import ( 11 explain_prediction_sklearn, 12 explain_prediction_linear_classifier, 13 explain_prediction_linear_regressor, 14 ) 15 from .unhashing import ( 16 InvertableHashingVectorizer, 17 FeatureUnhasher, 18 invert_hashing_and_fit, 19 ) File c:\Users\Ramsey\AppData\Local\Programs\Python\Python311\Lib\site-packages\eli5\sklearn\explain_weights.py:78 73 from eli5.transform import transform_feature_names 74 from eli5._feature_importances import ( 75 get_feature_importances_filtered, 76 get_feature_importance_explanation, 77 ) ---> 78 from .permutation_importance import PermutationImportance 81 LINEAR_CAVEATS = """ 82 Caveats: 83 1. 
Be careful with features which are not (...) 90 classification result for most examples. 91 """.lstrip() 93 HASHING_CAVEATS = """ 94 Feature names are restored from their hashes; this is not 100% precise 95 because collisions are possible. For known collisions possible feature names (...) 99 the result is positive. 100 """.lstrip() File c:\Users\Ramsey\AppData\Local\Programs\Python\Python311\Lib\site-packages\eli5\sklearn\permutation_importance.py:7 5 import numpy as np 6 from sklearn.model_selection import check_cv ----> 7 from sklearn.utils.metaestimators import if_delegate_has_method 8 from sklearn.utils import check_array, check_random_state 9 from sklearn.base import ( 10 BaseEstimator, 11 MetaEstimatorMixin, 12 clone, 13 is_classifier 14 ) ImportError: cannot import name 'if_delegate_has_method' from 'sklearn.utils.metaestimators' (c:\Users\Ramsey\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\metaestimators.py)
That gives us the weights associated to each feature, that can be seen as the contribution of each feature into predicting that the class will be y=1 (the client will subscribe after the campaign).
The names for each feature aren't really helping though. We can pass a list of column names to eli5, but we'll need to do a little gymnastics first to extract the names from the preprocessor in our pipeline (since the one hot encoder generated new features on the fly).
# Recover human-readable feature names after preprocessing: the passthrough
# numerical columns keep their names, and each one-hot column is named
# "<column>__<category>".
preprocessor = lr_model.named_steps["preprocessor"]
ohe_categories = preprocessor.named_transformers_["categorical"].categories_
new_ohe_features = []
for col, vals in zip(cat_features, ohe_categories):
    for val in vals:
        new_ohe_features.append(f"{col}__{val}")
all_features = num_features + new_ohe_features
Great, so now we have a nice list of columns after processing. Let's visualise the data in a dataframe just for sanity check:
# Sanity check: view the transformed training matrix with the recovered names.
pd.DataFrame(lr_model.named_steps["preprocessor"].transform(X_train), columns=all_features).head()
| age | campaign | pdays | previous | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | job__admin. | ... | month__oct | month__sep | day_of_week__fri | day_of_week__mon | day_of_week__thu | day_of_week__tue | day_of_week__wed | poutcome__failure | poutcome__nonexistent | poutcome__success | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | 2.0 | 999.0 | 0.0 | 1.1 | 93.994 | -36.4 | 4.860 | 5191.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1 | 51.0 | 5.0 | 999.0 | 0.0 | 1.1 | 93.994 | -36.4 | 4.858 | 5191.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | 46.0 | 2.0 | 999.0 | 0.0 | -1.8 | 92.893 | -46.2 | 1.244 | 5099.1 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 46.0 | 1.0 | 999.0 | 0.0 | 1.4 | 94.465 | -41.8 | 4.961 | 5228.1 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 25.0 | 5.0 | 999.0 | 0.0 | -1.8 | 92.893 | -46.2 | 1.266 | 5099.1 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
5 rows × 62 columns
Looks good!
# Same weight display as before, now with readable feature names.
eli5.show_weights(lr_model.named_steps["model"], feature_names=all_features)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) c:\Users\Ramsey\Downloads\responsible-ai-model-explainability.ipynb Cell 53 line 1 ----> <a href='vscode-notebook-cell:/c%3A/Users/Ramsey/Downloads/responsible-ai-model-explainability.ipynb#Y103sZmlsZQ%3D%3D?line=0'>1</a> eli5.show_weights(lr_model.named_steps["model"], feature_names=all_features) NameError: name 'eli5' is not defined
Looks like it's picking up principally on whether the month is March or not; the marketing campaign seems to have been more efficient in March?
We can also use eli5 to explain a specific prediction, let's pick a row in the test data:
# Pick one test row to explain (double brackets keep it as a 1-row DataFrame).
i = 4
X_test.iloc[[i]]
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 39993 | 27 | unknown | single | university.degree | no | yes | no | cellular | jun | wed | 4 | 3 | 2 | success | -1.7 | 94.055 | -39.8 | 0.767 | 4991.6 |
# Ground-truth label for the selected row (0 = did not subscribe, per the
# {"no": 0, "yes": 1} encoding above).
y_test.iloc[i]
0
This client did not subscribe to the term deposit after the campaign (the true label is 0). Let's see what our model would have predicted and how it would explain it.
We'll need to first transform our row into the format expected by our model as eli5 cannot work directly with our pipeline.
Note: eli5 actually does support pipeline, but with a limited number of transformations only. In our pipeline it does not support the passthrough transformation (which, funny enough, doesn't do anything...)
# Explain the single prediction: eli5 gets the bare estimator plus the
# already-transformed row, since it cannot traverse this pipeline itself.
eli5.show_prediction(lr_model.named_steps["model"],
lr_model.named_steps["preprocessor"].transform(X_test)[i],
feature_names=all_features, show_feature_values=True)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) c:\Users\Ramsey\Downloads\responsible-ai-model-explainability.ipynb Cell 59 line 1 ----> <a href='vscode-notebook-cell:/c%3A/Users/Ramsey/Downloads/responsible-ai-model-explainability.ipynb#Y112sZmlsZQ%3D%3D?line=0'>1</a> eli5.show_prediction(lr_model.named_steps["model"], <a href='vscode-notebook-cell:/c%3A/Users/Ramsey/Downloads/responsible-ai-model-explainability.ipynb#Y112sZmlsZQ%3D%3D?line=1'>2</a> lr_model.named_steps["preprocessor"].transform(X_test)[i], <a href='vscode-notebook-cell:/c%3A/Users/Ramsey/Downloads/responsible-ai-model-explainability.ipynb#Y112sZmlsZQ%3D%3D?line=2'>3</a> feature_names=all_features, show_feature_values=True) NameError: name 'eli5' is not defined
eli5 can also be used to intepret decision trees:
# Tune tree depth and minimum split size for the decision tree, 5-fold CV.
dt_grid = {"model__max_depth": [3, 5, 7],
           "model__min_samples_split": [2, 5]}
gs = GridSearchCV(dt_model, dt_grid, cv=5, scoring="accuracy", n_jobs=-1)
gs.fit(X_train, y_train)
`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical',
'passthrough',
['age',
'campaign',
'pdays',
'previous',
'emp.var.rate',
'cons.price.idx',
'cons.conf.idx',
'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job',
'marital',
'education',
'default',
'housing',
'loan',
'contact',
'month',
'day_of_week',
'poutcome'])])),
('model',
DecisionTreeClassifier(class_weight='balanced'))]),
n_jobs=-1,
param_grid={'model__max_depth': [3, 5, 7],
'model__min_samples_split': [2, 5]},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical',
'passthrough',
['age',
'campaign',
'pdays',
'previous',
'emp.var.rate',
'cons.price.idx',
'cons.conf.idx',
'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job',
'marital',
'education',
'default',
'housing',
'loan',
'contact',
'month',
'day_of_week',
'poutcome'])])),
('model',
DecisionTreeClassifier(class_weight='balanced'))]),
n_jobs=-1,
param_grid={'model__max_depth': [3, 5, 7],
'model__min_samples_split': [2, 5]},
scoring='accuracy')Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_of_week',
'poutcome'])])),
('model', DecisionTreeClassifier(class_weight='balanced'))])ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays', 'previous',
'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month',
'day_of_week', 'poutcome'])])['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
passthrough
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
OneHotEncoder(handle_unknown='ignore', sparse=False)
DecisionTreeClassifier(class_weight='balanced')
Let's see our best parameters and score
# Best decision-tree hyper-parameters, then their mean CV accuracy.
for result in (gs.best_params_, gs.best_score_):
    print(result)
{'model__max_depth': 5, 'model__min_samples_split': 2}
0.8509929442344253
# Apply the tuned hyper-parameters to the standalone decision-tree pipeline.
dt_model.set_params(**gs.best_params_)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_of_week',
'poutcome'])])),
('model',
DecisionTreeClassifier(class_weight='balanced', max_depth=5))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_of_week',
'poutcome'])])),
('model',
DecisionTreeClassifier(class_weight='balanced', max_depth=5))])ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays', 'previous',
'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month',
'day_of_week', 'poutcome'])])['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
passthrough
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
OneHotEncoder(handle_unknown='ignore', sparse=False)
DecisionTreeClassifier(class_weight='balanced', max_depth=5)
# Refit the tuned decision tree on the full training set and predict on test.
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
# Test-set accuracy for the tuned decision tree.
accuracy_score(y_test, y_pred)
0.8553046856033018
# Per-class metrics for the decision tree on the held-out test set.
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.95 0.89 0.92 10965
1 0.41 0.62 0.49 1392
accuracy 0.86 12357
macro avg 0.68 0.75 0.70 12357
weighted avg 0.89 0.86 0.87 12357
For Decision Trees, eli5 only gives feature importance, which does not say in what direction a feature impact the predicted outcome.
# Feature importances of the fitted tree (magnitude only, no direction).
eli5.show_weights(dt_model.named_steps["model"], feature_names=all_features)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) c:\Users\Ramsey\Downloads\responsible-ai-model-explainability.ipynb Cell 70 line 1 ----> <a href='vscode-notebook-cell:/c%3A/Users/Ramsey/Downloads/responsible-ai-model-explainability.ipynb#Y126sZmlsZQ%3D%3D?line=0'>1</a> eli5.show_weights(dt_model.named_steps["model"], feature_names=all_features) NameError: name 'eli5' is not defined
Here the most important feature seems to be nr.employed. We can also get an explanation for a given prediction, this will calculate the contribution of each feature in the prediction:
# Explain one decision-tree prediction by following its decision path;
# the row must be pre-transformed because eli5 cannot run this pipeline.
eli5.show_prediction(dt_model.named_steps["model"],
dt_model.named_steps["preprocessor"].transform(X_test)[i],
feature_names=all_features, show_feature_values=True)
y=1 (probability 0.961) top features
| Contribution? | Feature | Value |
|---|---|---|
| +0.500 | <BIAS> | 1.000 |
| +0.369 | nr.employed | 4991.600 |
| +0.083 | pdays | 3.000 |
| +0.008 | day_of_week__mon | 0.000 |
| +0.000 | previous | 2.000 |
Here the explanation for a single prediction is calculated by following the decision path in the tree, and adding up contribution of each feature from each node crossed into the overall probability predicted.
eli5 can also be used to explain black box models, but we will use Lime and SHAP for our two last models instead.
LIME stands for Local Interpretable Model-Agnostic Explanations. We can use it with any model we've built in order to explain why it took a specific decision for a given observation. To do so, LIME creates a dataset in the locality of our observation by perturbating the different features. Then it fits a local linear model on this data and uses the weights on each feature to provide an explanation.
# Tune the random forest's depth and minimum split size with 5-fold CV.
rf_grid = {"model__max_depth": [10, 15],
           "model__min_samples_split": [5, 10]}
gs = GridSearchCV(rf_model, rf_grid, cv=5, scoring="accuracy", n_jobs=-1)
gs.fit(X_train, y_train)
`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical',
'passthrough',
['age',
'campaign',
'pdays',
'previous',
'emp.var.rate',
'cons.price.idx',
'cons.conf.idx',
'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job',
'marital',
'education',
'default',
'housing',
'loan',
'contact',
'month',
'day_of_week',
'poutcome'])])),
('model',
RandomForestClassifier(class_weight='balanced',
n_jobs=-1))]),
n_jobs=-1,
param_grid={'model__max_depth': [10, 15],
'model__min_samples_split': [5, 10]},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical',
'passthrough',
['age',
'campaign',
'pdays',
'previous',
'emp.var.rate',
'cons.price.idx',
'cons.conf.idx',
'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job',
'marital',
'education',
'default',
'housing',
'loan',
'contact',
'month',
'day_of_week',
'poutcome'])])),
('model',
RandomForestClassifier(class_weight='balanced',
n_jobs=-1))]),
n_jobs=-1,
param_grid={'model__max_depth': [10, 15],
'model__min_samples_split': [5, 10]},
scoring='accuracy')Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_of_week',
'poutcome'])])),
('model',
RandomForestClassifier(class_weight='balanced', n_jobs=-1))])ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays', 'previous',
'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month',
'day_of_week', 'poutcome'])])['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
passthrough
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
OneHotEncoder(handle_unknown='ignore', sparse=False)
RandomForestClassifier(class_weight='balanced', n_jobs=-1)
Let's see our best parameters and score
print(gs.best_params_)
print(gs.best_score_)
{'model__max_depth': 15, 'model__min_samples_split': 5}
0.8738162957985564
rf_model.set_params(**gs.best_params_)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_of_week',
'poutcome'])])),
('model',
RandomForestClassifier(class_weight='balanced', max_depth=15,
min_samples_split=5, n_jobs=-1))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_of_week',
'poutcome'])])),
('model',
RandomForestClassifier(class_weight='balanced', max_depth=15,
min_samples_split=5, n_jobs=-1))])ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays', 'previous',
'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month',
'day_of_week', 'poutcome'])])['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
passthrough
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
OneHotEncoder(handle_unknown='ignore', sparse=False)
RandomForestClassifier(class_weight='balanced', max_depth=15,
min_samples_split=5, n_jobs=-1)rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
accuracy_score(y_test, y_pred)
0.8798252002913328
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.94 0.92 0.93 10965
1 0.47 0.58 0.52 1392
accuracy 0.88 12357
macro avg 0.71 0.75 0.73 12357
weighted avg 0.89 0.88 0.88 12357
We can look at the features importance with Eli5 first:
# eli5 was never imported, which is why this cell previously failed with
# `NameError: name 'eli5' is not defined` — import it before use.
import eli5

# Show global feature importances (mean decrease in impurity, with the
# std-dev across the trees of the forest) for the tuned RandomForest.
eli5.show_weights(rf_model.named_steps["model"],
                  feature_names=all_features)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) c:\Users\Ramsey\Downloads\responsible-ai-model-explainability.ipynb Cell 86 line 1 ----> <a href='vscode-notebook-cell:/c%3A/Users/Ramsey/Downloads/responsible-ai-model-explainability.ipynb#Y151sZmlsZQ%3D%3D?line=0'>1</a> eli5.show_weights(rf_model.named_steps["model"], <a href='vscode-notebook-cell:/c%3A/Users/Ramsey/Downloads/responsible-ai-model-explainability.ipynb#Y151sZmlsZQ%3D%3D?line=1'>2</a> feature_names=all_features) NameError: name 'eli5' is not defined
We can explain roughly what our model seems to focus on mostly. We also get the standard deviation of feature importance accross the multiple trees in our ensemble.
gs = GridSearchCV(xgb_model, {"model__max_depth": [5, 10],
"model__min_child_weight": [5, 10],
"model__n_estimators": [25]},
n_jobs=-1, cv=5, scoring="accuracy")
gs.fit(X_train, y_train)
c:\Users\Ramsey\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\preprocessing\_encoders.py:975: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value. warnings.warn(
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical',
'passthrough',
['age',
'campaign',
'pdays',
'previous',
'emp.var.rate',
'cons.price.idx',
'cons.conf.idx',
'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job',
'marital',
'education',
'default',
'housing',
'l...
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=None,
n_jobs=-1,
num_parallel_tree=None,
random_state=None, ...))]),
n_jobs=-1,
param_grid={'model__max_depth': [5, 10],
'model__min_child_weight': [5, 10],
'model__n_estimators': [25]},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical',
'passthrough',
['age',
'campaign',
'pdays',
'previous',
'emp.var.rate',
'cons.price.idx',
'cons.conf.idx',
'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job',
'marital',
'education',
'default',
'housing',
'l...
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=None,
n_jobs=-1,
num_parallel_tree=None,
random_state=None, ...))]),
n_jobs=-1,
param_grid={'model__max_depth': [5, 10],
'model__min_child_weight': [5, 10],
'model__n_estimators': [25]},
scoring='accuracy')Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_...
feature_types=None, gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None,
n_estimators=None, n_jobs=-1,
num_parallel_tree=None, random_state=None, ...))])ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays', 'previous',
'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month',
'day_of_week', 'poutcome'])])['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
passthrough
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
OneHotEncoder(handle_unknown='ignore', sparse=False)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=-1,
num_parallel_tree=None, random_state=None, ...)Let's see our best parameters and score.
print(gs.best_params_)
print(gs.best_score_)
xgb_model.set_params(**gs.best_params_)
xgb_model.fit(X_train, y_train)
{'model__max_depth': 5, 'model__min_child_weight': 5, 'model__n_estimators': 25}
0.9002461587725588
c:\Users\Ramsey\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\preprocessing\_encoders.py:975: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value. warnings.warn(
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_...
feature_types=None, gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=5, max_leaves=None, min_child_weight=5,
missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=25, n_jobs=-1,
num_parallel_tree=None, random_state=None, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays',
'previous', 'emp.var.rate',
'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital',
'education', 'default',
'housing', 'loan', 'contact',
'month', 'day_...
feature_types=None, gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=5, max_leaves=None, min_child_weight=5,
missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=25, n_jobs=-1,
num_parallel_tree=None, random_state=None, ...))])ColumnTransformer(transformers=[('numerical', 'passthrough',
['age', 'campaign', 'pdays', 'previous',
'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m',
'nr.employed']),
('categorical',
OneHotEncoder(handle_unknown='ignore',
sparse=False),
['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month',
'day_of_week', 'poutcome'])])['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
passthrough
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
OneHotEncoder(handle_unknown='ignore', sparse=False)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=5, max_leaves=None,
min_child_weight=5, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=25, n_jobs=-1,
num_parallel_tree=None, random_state=None, ...)Generate predictions
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred)
0.9015133122926277
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.91 0.99 0.95 10965
1 0.69 0.23 0.34 1392
accuracy 0.90 12357
macro avg 0.80 0.61 0.65 12357
weighted avg 0.88 0.90 0.88 12357
In order to explain why the model classifies individual observations as class 0 or 1, we are going to use the LimeTabularExplainer from the library lime; this is the main explainer to use for tabular data. Lime also provides explainers for text data, for images and for time-series.
When using the tabular explainer, we need to provide our training set as parameter so that lime can compute statistics on each feature, either mean and std for numerical features, or frequency of values for categorical features. Those statistics are used to scale the data and generate new perturbated data to train our local linear models on.
from lime.lime_tabular import LimeTabularExplainer
The parameters passed to the explainer are:
- mode: the explainer can be used for classification or regression
- feature_names: list of labels for our features
- categorical_features: list of indexes of categorical features
- categorical_names: dict mapping each index of a categorical feature to a list of corresponding labels
- discretize_continuous: will discretize numerical values into buckets that can be used for explanation. For instance it can tell us that the decision was made because distance is in bucket [5km, 10km] instead of just telling us distance is an important feature.

First, in order to get the categorical_names parameter we need to build a dictionary with indexes of categorical values in the original dataset as keys and lists of possible categories as values:
# Map each categorical column's integer position in X_train to its ordered
# list of category labels, recovered from the one-hot encoder's output
# feature names (which look like "<column>__<category>").
categorical_names = {
    X_train.columns.get_loc(col): [
        encoded.split("__")[1]
        for encoded in new_ohe_features
        if encoded.split("__")[0] == col
    ]
    for col in cat_features
}
categorical_names
{1: ['admin.',
'blue-collar',
'entrepreneur',
'housemaid',
'management',
'retired',
'self-employed',
'services',
'student',
'technician',
'unemployed',
'unknown'],
2: ['divorced', 'married', 'single', 'unknown'],
3: ['basic.4y',
'basic.6y',
'basic.9y',
'high.school',
'illiterate',
'professional.course',
'university.degree',
'unknown'],
4: ['no', 'unknown', 'yes'],
5: ['no', 'unknown', 'yes'],
6: ['no', 'unknown', 'yes'],
7: ['cellular', 'telephone'],
8: ['apr', 'aug', 'dec', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep'],
9: ['fri', 'mon', 'thu', 'tue', 'wed'],
13: ['failure', 'nonexistent', 'success']}
Lime needs the dataset that is passed to have categorical values converted to integer labels that maps to the values in categorical_names. For instance, label 0 for the column 2 will map to divorced. We will use a custom helper function to do so, that converts data from original to LIME and from LIME to original format.
That function is going over all categorical features and replacing strings by the correct integer labels, feel free to check helpers.py.
def convert_to_lime_format(X, categorical_names, col_names=None, invert=False):
    """Translate data between string labels and LIME's integer-label format.

    LIME wants categorical columns encoded as integer codes that index into
    ``categorical_names``, while our sklearn pipelines expect the original
    string labels. With ``invert=False`` strings become integer codes; with
    ``invert=True`` integer codes become strings again.

    Parameters
    ----------
    X : pd.DataFrame or array-like
        Data to convert. Non-DataFrame input is wrapped into a DataFrame
        using ``col_names``.
    categorical_names : dict[int, list[str]]
        Column position -> ordered category labels; the same dict that is
        passed to LIME, so the two stay consistent.
    col_names : sequence, optional
        Column names used only when ``X`` is not already a DataFrame.
    invert : bool
        Direction of the conversion (see above).

    Returns
    -------
    pd.DataFrame
        A new frame; the input is never mutated.
    """
    # Work on a copy so the caller's data is untouched.
    if isinstance(X, pd.DataFrame):
        frame = X.copy()
    else:
        frame = pd.DataFrame(X, columns=col_names)
    for position, labels in categorical_names.items():
        if invert:
            mapping = dict(enumerate(labels))  # code -> label
        else:
            mapping = {label: code for code, label in enumerate(labels)}  # label -> code
        frame.iloc[:, position] = frame.iloc[:, position].map(mapping)
    return frame
Let's check that it worked:
convert_to_lime_format(X_train, categorical_names).head()
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7309 | 50 | 1 | 2 | 0 | 0 | 0 | 0 | 1 | 6 | 2 | 2 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.860 | 5191.0 |
| 4169 | 51 | 7 | 1 | 5 | 1 | 0 | 0 | 1 | 6 | 1 | 5 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.858 | 5191.0 |
| 35810 | 46 | 9 | 0 | 5 | 0 | 2 | 0 | 0 | 6 | 1 | 2 | 999 | 0 | 1 | -1.8 | 92.893 | -46.2 | 1.244 | 5099.1 |
| 9668 | 46 | 0 | 2 | 6 | 0 | 2 | 0 | 1 | 4 | 1 | 1 | 999 | 0 | 1 | 1.4 | 94.465 | -41.8 | 4.961 | 5228.1 |
| 34389 | 25 | 7 | 2 | 3 | 0 | 0 | 0 | 0 | 6 | 2 | 5 | 999 | 0 | 1 | -1.8 | 92.893 | -46.2 | 1.266 | 5099.1 |
# Build the tabular explainer on the LIME-formatted training data. LIME
# computes per-feature statistics here (mean/std for numerics, value
# frequencies for categoricals) to drive its perturbation sampling.
explainer = LimeTabularExplainer(convert_to_lime_format(X_train, categorical_names).values,
                                 mode="classification",
                                 feature_names=X_train.columns.tolist(),
                                 categorical_names=categorical_names,
                                 # Pass a concrete list: dict_keys is a lazy
                                 # view, and LIME's API expects a list of
                                 # categorical column indices.
                                 categorical_features=list(categorical_names.keys()),
                                 discretize_continuous=True,
                                 random_state=42)
Great, our explainer is ready. Now let's pick an observation we want to explain.
We'll create a variable called observation that contains our ith observation in the test dataset.
i = 2
X_observation = X_test.iloc[[i], :]
X_observation
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12077 | 35 | technician | single | professional.course | no | no | no | telephone | jun | fri | 1 | 999 | 0 | nonexistent | 1.4 | 94.465 | -41.8 | 4.947 | 5228.1 |
print(f"""\
* True label: {y_test.iloc[i]}
* LR: {lr_model.predict_proba(X_observation)[0]}
* DT: {dt_model.predict_proba(X_observation)[0]}
* RF: {rf_model.predict_proba(X_observation)[0]}
* XGB: {xgb_model.predict_proba(X_observation)[0]}""")
* True label: 0 * LR: [0.71175179 0.28824821] * DT: [0.75848014 0.24151986] * RF: [0.81169052 0.18830948] * XGB: [0.9572216 0.04277838]
Let's convert our observation to lime format and convert it to a numpy array.
observation = convert_to_lime_format(X_test.iloc[[i], :],categorical_names).values[0]
observation
array([35, 9, 2, 5, 0, 0, 0, 1, 4, 0, 1, 999, 0, 1, 1.4, 94.465, -41.8,
4.947, 5228.1], dtype=object)
In order to explain a prediction, we use the explain_instance method on our explainer. This will generate new data with perturbated features around the observation and learn a local linear model. It needs to take:
rf_model.predict_proba because our pipeline expects string labels for categorical values. We will need to create a custom function rf_predict_proba that first converts back integer labels to strings and then calls rf_model.predict_proba.num_features: number of features to consider in explanation# Let write a custom predict_proba functions for our models:
from functools import partial

def custom_predict_proba(X, model):
    """Predict probabilities after mapping LIME's integer labels back to strings.

    LIME hands us perturbed rows in its integer-coded format, but our
    pipelines were fitted on the original string-labelled columns, so we
    invert the conversion before calling the model.
    """
    original_format = convert_to_lime_format(
        X, categorical_names, col_names=X_train.columns, invert=True
    )
    return model.predict_proba(original_format)

# Bind one probability function to each fitted pipeline.
lr_predict_proba = partial(custom_predict_proba, model=lr_model)
dt_predict_proba = partial(custom_predict_proba, model=dt_model)
rf_predict_proba = partial(custom_predict_proba, model=rf_model)
xgb_predict_proba = partial(custom_predict_proba, model=xgb_model)
Let's test our custom functions to make sure they generate probabilities properly.
explanation = explainer.explain_instance(observation, lr_predict_proba, num_features=5)
Now that we have generated our explanation, we have access to several representations. The most useful one when working in a notebook is show_in_notebook.
On the left it shows the list of probabilities for each class; here the model classified our observation as 0 (not subscribed) with a high probability.
show_table=True, you will see the table with the most important features for this observation on the right.explanation.show_in_notebook(show_table=True, show_all=False)
| Feature | Value |
You can also save the explanation to an html file with save_to_file to share it.
explanation.save_to_file("explanation.html")
LIME is fitting a linear model on a local perturbated dataset. You can access the coefficients, the intercept and the R squared of the linear model by calling respectively .local_exp, .intercept and .score on your explanation.
print(explanation.local_exp)
print(explanation.intercept)
print(explanation.score)
{1: [(14, -0.3370625397424544), (17, 0.1945853046215029), (18, -0.18053443382778087), (15, 0.09880432605075426), (7, -0.06929534733378377)]}
{1: 0.6711143104840434}
0.4127990866113822
# dt_predict_proba
If your R-squared is low, the linear model that LIME fitted isn't a great approximation to your model, which means you should not rely too much on the explanation it provides.
explanation = explainer.explain_instance(observation, dt_predict_proba, num_features=5)
explanation.show_in_notebook(show_table=True, show_all=False)
print(explanation.score)
| Feature | Value |
0.150448667782488
explanation = explainer.explain_instance(observation, rf_predict_proba, num_features=5)
explanation.show_in_notebook(show_table=True, show_all=False)
print(explanation.score)
| Feature | Value |
0.3958189014137279
explanation = explainer.explain_instance(observation, xgb_predict_proba, num_features=5)
explanation.show_in_notebook(show_table=True, show_all=False)
print(explanation.score)
| Feature | Value |
0.31849793426519735
import shap
# Need to load JS vis in the notebook
shap.initjs()
IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
SHAP has a generic explainer that works for any model and a TreeExplainer optimised for tree based models. Here we will focus on the TreeExplainer with our XGB model (the hardest to intepret)
explainer = shap.TreeExplainer(xgb_model.named_steps["model"])
[00:57:31] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0cec3277c4d9d0165-1\xgboost\xgboost-ci-windows\src\c_api\c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
In order to compute the shapley values with the tree explainer, we need to call the shap_values methods passing a dataset. That can be quite computationally expensive, so we will only pass 1000 samples picked at random.
# Transform through the fitted preprocessor so SHAP sees the same
# one-hot-encoded matrix the XGB model was trained on. Computing SHAP
# values is expensive, so we explain only 1000 randomly sampled rows.
observations = xgb_model.named_steps["preprocessor"].transform(X_train.sample(1000, random_state=42))
shap_values = explainer.shap_values(observations)
Now we can start visualising our explanations using the force_plot function from the shap package passing our first shap_value (we also need to pass explainer.expected_value which is the base value).
i = 0
shap.force_plot(explainer.expected_value, shap_values[i],
features=observations[i], feature_names=all_features)
This explanation shows how each feature contributes to shifting the prediction from the base value to the output value of the model either by decreasing or increasing the probability of our class.
We can also visualise all points in our dataset at once with a given class by passing all explanations for that class to force_plot
shap.force_plot(explainer.expected_value, shap_values,
features=observations, feature_names=all_features)
We can see our 1000 samples on the x axis. The y-axis corresponds to the same scale we were looking at before, where blue values corresponds to the probability decreasing, red increasing. Hover with your mouse on a point to see the main features impacting a given observation. You can also use the drop down on the left to visualise the impact of specific features, for example duration only.
Another interesting plot that we can generate with SHAP is the summary_plot, it can be seen as a feature importance plot with more meaningful insights. Below we're plotting the summary plot for class 1 on the whole subset.
The colour corresponds to the value of the feature and the x axis corresponds to the SHAP value, meaning the impact on the probability.
shap.summary_plot(shap_values, features=observations, feature_names=all_features)
That's better than the built-in feature importance on RandomForest because not only we can see what features are important but also how they affect our predictions.
shap.dependence_plot("nr.employed", shap_values,
pd.DataFrame(observations, columns=all_features))
The tools we have seen above also work with text data and images. There are plenty of examples available online for text-data. Here we will just demonstrate how to use Lime to explain an image classifier.
Lime can also be used to explain decisions made for image classification.
In this example we will use the pretrained InceptionV3 model available with Keras. Lime is quite slow with images, so it's wiser to stick to a "shallow" deep learning model.
from keras.applications.inception_v3 import InceptionV3, preprocess_input, decode_predictions
from keras.preprocessing.image import load_img, img_to_array
Let's create a new instance of InceptionV3
model = InceptionV3()
Now we'll load a picture of a toucan; we need to make sure we load it at the right size for InceptionV3, here 299×299.
# Use a raw string so the backslashes in the Windows path are not treated as
# escape sequences — the plain string literal raised
# "SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes".
image_raw = load_img(r"c:\Users\Ramsey\Downloads\Toucan.png", target_size=(299, 299))
image_raw
Cell In[128], line 1 image_raw = load_img("c:\Users\Ramsey\Downloads\Toucan.png", target_size=(299,299)) ^ SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape
image_raw = load_img("c:\\Users\\Ramsey\\Downloads\\Toucan.png", target_size=(299,299))
image_raw
image_raw = load_img(r"c:\Users\Ramsey\Downloads\Toucan.png", target_size=(299,299))
image_raw
We need to process the image to get a numpy array compatible with our model. Here we simply load it into an array, reshape it and use the preprocess_input method provided by Keras, which performs all the preprocessing steps for us.
# Convert to numpy array, reshape and preprocess
# PIL image -> float array of shape (299, 299, 3).
image = img_to_array(image_raw)
# Add a leading batch dimension: the model expects (1, height, width, channels).
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# Apply InceptionV3's own input preprocessing (pixel scaling — see Keras docs).
image = preprocess_input(image)
Now that our image is ready, generate predictions by using .predict as usual.
predictions = model.predict(image)
1/1 [==============================] - 0s 382ms/step
You can check what labels your predictions correspond to by calling the function decode_predictions on your predictions. By default it returns the 5 more likely predictions
decode_predictions(predictions)
[[('n01843383', 'toucan', 0.9833592),
('n04146614', 'school_bus', 0.00079244276),
('n01829413', 'hornbill', 0.0005329601),
('n04154565', 'screwdriver', 0.00015542058),
('n07892512', 'red_wine', 0.00010848605)]]
Great, we predicted a toucan with a probability of 99%, that's promising!
Remember that LIME needs the indices of the class we are interested in. Execute the cell below to get the indices corresponding to the 5 most probable classes we predicted above. Those indices correspond to the classes used in the ImageNet dataset that was used to train our model.
model.predict(image).argsort()[0, -5:][::-1]
1/1 [==============================] - 1s 969ms/step
array([ 96, 779, 93, 784, 966], dtype=int64)
Here the toucan corresponds to index 96, the school bus to index 779, etc..
Let's get started. First import the LimeImageExplainer and instantiate a new explainer
from lime.lime_image import LimeImageExplainer
explainer = LimeImageExplainer()
The explainer is the same as before, we call explain_instance to generate a new explanation. We need to provide:
- top_labels: the number of classes to explain. Our model generates probabilities for more than 1000 classes (we looked at the first five above), and we do not want LIME to fit a local model for each of them. As LIME is pretty slow with images, let's only ask for explanations of our two main classes, toucan and school bus.
- num_samples: the number of new data points to create to fit a linear model; we set it to 100 here to keep runtime manageable.

WARNING: this will be slow.
# Fit local surrogate models for only the top-2 predicted classes; LIME is
# slow on images, so num_samples is kept to 100 perturbed copies here.
explanation = explainer.explain_instance(image[0], model.predict,
                                         top_labels=2, num_samples=100,
                                         random_seed=42)
8%|▊ | 8/100 [00:00<00:01, 72.58it/s]
1/1 [==============================] - 5s 5s/step
16%|█▌ | 16/100 [00:05<00:34, 2.46it/s]
1/1 [==============================] - 5s 5s/step
27%|██▋ | 27/100 [00:11<00:30, 2.37it/s]
1/1 [==============================] - 3s 3s/step
38%|███▊ | 38/100 [00:14<00:20, 3.08it/s]
1/1 [==============================] - 3s 3s/step
42%|████▏ | 42/100 [00:18<00:27, 2.08it/s]
1/1 [==============================] - 5s 5s/step
56%|█████▌ | 56/100 [00:23<00:16, 2.62it/s]
1/1 [==============================] - 4s 4s/step
67%|██████▋ | 67/100 [00:29<00:12, 2.59it/s]
1/1 [==============================] - 4s 4s/step
71%|███████ | 71/100 [00:33<00:16, 1.80it/s]
1/1 [==============================] - 3s 3s/step
89%|████████▉ | 89/100 [00:37<00:03, 3.21it/s]
1/1 [==============================] - 3s 3s/step
99%|█████████▉| 99/100 [00:41<00:00, 3.32it/s]
1/1 [==============================] - 4s 4s/step
100%|██████████| 100/100 [00:46<00:00, 2.15it/s]
from skimage.segmentation import mark_boundaries
from matplotlib import pyplot as plt
First let's check the explanation for the predicted class toucan. That corresponds to label 96 in the ImageNet classes. We need to use the method get_image_and_mask on our explanation object with the following parameters:
features in tabular data. That returns a new image and a mask as numpy arrays; you can then use mark_boundaries to show the image together with the mask.
# Explain class 96 (toucan): keep only the 5 superpixels that contribute
# positively to the class, hiding the rest of the image.
temp, mask = explanation.get_image_and_mask(96, positive_only=True, num_features=5, hide_rest=True)
# plot image and mask together
# temp / 2 + 0.5 maps the preprocessed pixel values back into [0, 1] for display.
plt.imshow(mark_boundaries(temp / 2 + 0.5, mask))
<matplotlib.image.AxesImage at 0x19168303a10>
What feature do you expect to be the most important in that decision? Plot the image with only the main feature (num_features=1)
temp, mask = explanation.get_image_and_mask(96, positive_only=True, num_features=1, hide_rest=True)
plt.imshow(mark_boundaries(temp / 2 + 0.5, mask))
<matplotlib.image.AxesImage at 0x1916f817050>
The second class predicted by our model was a bus (label 779), set positive_only=False in order to see what features contributed positively and negatively to that decision. What do you see?
temp, mask = explanation.get_image_and_mask(779, positive_only=False, num_features=8, hide_rest=True)
plt.imshow(mark_boundaries(temp / 2 + 0.5, mask))
<matplotlib.image.AxesImage at 0x19173662a10>
Great, now you can try to change the number of features you're looking at and deactivate positive_only in order to see features that contribute negatively to the class. You can also look at other classes or try other pictures.